#' ---
#' title: Reproduce Appendix Figures and Tables
#' author: Joe Ornstein
#' date: 2025-07-06
#' version: 0.2
#' ---


rm(list = ls())
cat('\n\n**Appendix Figures & Tables**\n\n')

library(tidyverse)

## Ablate Similarity Metrics (Table A1 & Figures A1/A2) ------------------

tb <- tribble(~Application, ~Predictors, ~'Brier Score', ~'Log Likelihood', ~fig_label,
              'Voter File', 'Both', NA, NA, 'fig_a1_a.png',
              'Voter File', 'Embedding Similarity Only', NA, NA, 'fig_a1_b.png',
              'Voter File', 'Lexical Similarity Only', NA, NA, 'fig_a1_c.png',
              'Multilingual Record Linkage', 'Both', NA, NA, 'fig_a2_a.png',
              'Multilingual Record Linkage', 'Embedding Similarity Only', NA, NA, 'fig_a2_b.png',
              'Multilingual Record Linkage', 'Lexical Similarity Only', NA, NA, 'fig_a2_c.png',)
model <- 'gpt-4o-2024-11-20'

for(i in 1:3){

  path_to_files <- 'data/candidate-merge/'

  fmla <- case_when(tb$Predictors[i] == 'Both' ~ 'match ~ sim + jw',
                    tb$Predictors[i] == 'Embedding Similarity Only' ~ 'match ~ sim',
                    tb$Predictors[i] == 'Lexical Similarity Only' ~ 'match ~ jw')

  # read in hand-labeled calibration test
  calibration <- readxl::read_xlsx(
    paste0(path_to_files, model, '/',
           fmla, '/l2_calibration_validated.xlsx')) |>
    filter(match_probability > 0.15, # only hand-labeled this set
           jw < 1) |>  # keep only the fuzzy matches
    mutate(match = as.numeric(hand_label == 'Yes'))

  # binned calibration plot
  subfig <- calibration |>
    group_by(cut(match_probability, seq(0.15, 1, 0.09))) |>
    summarize(avg_predicted_prob = mean(match_probability),
              avg_match_rate = mean(match),
              n()) |>
    ggplot(mapping = aes(x=avg_predicted_prob,
                         y=avg_match_rate)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1,
                linetype = 'dashed') +
    theme_bw() +
    labs(x = 'Average Predicted Probability',
         y = 'Average Match Rate') +
    scale_x_continuous(limits = c(0,1)) +
    scale_y_continuous(limits = c(0,1))

  ggsave(paste0('figures/', tb$fig_label[i], '.png'),
         plot = subfig,
         width = 6, height = 6)

  tb$`Brier Score`[i] <- mean((calibration$match_probability - calibration$match)^2)
  p <- abs(calibration$match - (1-calibration$match_probability))
  tb$`Log Likelihood`[i] <- sum(log(p))
}

# merge with truth
elections <- read_csv('raw/parlgov_elections.csv', progress = FALSE) |>
  # remove the English-speaking countries
  filter(!(country_name %in% c('Australia', 'New Zealand', 'Canada', 'United Kingdom', 'Ireland'))) |>
  # keep just the seats assigned to parties
  filter(!(party_name_english %in% c('one seat', 'one-seat', 'no seat', 'others', 'ethnic', 'no party affiliation'))) |>
  # just keep parliamentary elections (not European Parliament)
  filter(election_type == 'parliament') |>
  # keep only the parties that won seats
  filter(!is.na(seats), seats > 0) |>
  select(country_name, election_date, party_name_english, party_name, seats, left_right)

for(i in 4:6){

  path_to_files <- 'data/parlgov-calibration/'

  fmla <- case_when(tb$Predictors[i] == 'Both' ~ 'match ~ sim + jw',
                    tb$Predictors[i] == 'Embedding Similarity Only' ~ 'match ~ sim',
                    tb$Predictors[i] == 'Lexical Similarity Only' ~ 'match ~ jw')

  # load and bind merged datasets
  d <- list()
  j <- 1
  for(f in list.files(paste0('data/parlgov-calibration/', model, '/', fmla, '/'),
                      pattern = '\\.RData$',
                      full.names = TRUE)){
    load(f)
    d[[j]] <- df
    j <- j + 1
  }
  df <- bind_rows(d)

  df <- df |>
    select(country_name,
           election_date,
           party_name = A,
           party_name_english = B,
           match_probability, match) |>
    left_join(elections |> mutate(true_match = 1)) |>
    mutate(true_match = replace_na(true_match, 0))

  # calibration plot
  subfig <- df |>
    group_by(cut(match_probability, seq(0, 1, 0.1))) |>
    summarize(avg_predicted_prob = mean(match_probability),
              avg_match_rate = mean(true_match),
              n()) |>
    ggplot(mapping = aes(x=avg_predicted_prob,
                         y=avg_match_rate)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1,
                linetype = 'dashed') +
    theme_bw() +
    labs(x = 'Average Predicted Probability',
         y = 'Average Match Rate') +
    scale_x_continuous(limits = c(0,1)) +
    scale_y_continuous(limits = c(0,1))

  ggsave(paste0('figures/', tb$fig_label[i], '.png'),
         plot = subfig,
         width = 6, height = 6)

  tb$`Brier Score`[i] <- mean((df$match_probability - df$true_match)^2)
  p <- abs(df$true_match - (1-df$match_probability))
  tb$`Log Likelihood`[i] <- sum(log(p))
}

# save Table A1
tb$`Brier Score` <- round(tb$`Brier Score`, 3)
tb$`Log Likelihood` <- round(tb$`Log Likelihood`, 2)

tb |>
  select(-fig_label) |>
  xtable::xtable(type = 'latex',
                 caption = 'Estimated match probability calibration in two applications, varying model specification',
                 label = 'tab:calibration') |>
  print(file = 'tables/table_a1.tex')


## Plot similarity score distribution (Figure A4) ---------------------

load('data/candidate-merge/gpt-4o-2024-11-20/match ~ sim + jw/l2_fuzzylink_all_pairs.RData')

df <- df |>
  # note: 8.9% is the cutoff (pi) that maximizes expected F1 scores for Application 1
  mutate(match = if_else(is.na(match) & match_probability <= 0.089,
                         'No', match),
         match = if_else(is.na(match) & match_probability > 0.089,
                         'Yes', match),
         match_chr = factor(if_else(match == 'Yes',
                                    'Matches', 'Non-Matches'),
                            levels = c('Non-Matches', 'Matches'))) |>
  # convert to numeric
  mutate(match = as.numeric(match == 'Yes'))

fig_a4 <- ggplot(data = df,
       mapping = aes(x = sim,
                     y = jw,
                     color = match_chr)) +
  geom_point(alpha = 0.1) +
  # facet_wrap(~match_chr) +
  theme_bw() +
  labs(x = 'Embedding Similarity',
       y = 'Jaro-Winkler Similarity',
       color = NULL) +
  scale_color_manual(values = c('black', 'red')) +
  coord_equal()

ggsave(filename = 'figures/fig_a4.png',
       plot = fig_a4,
       width = 8, height = 5)


## Active vs. Passive Labeling (Table A2) ------------------------

passive_learner <- function(d, p){

  # label a subset
  d$truth <- d$match
  d$match <- NA
  matches_to_label <- sample(1:nrow(d),
                             size = floor(nrow(d) * p),
                             replace = FALSE)
  d$match[matches_to_label] <- ifelse(
    d$truth[matches_to_label] == 1, 'Yes', 'No')

  fit <- glm(as.numeric(match == 'Yes') ~ sim + jw,
             data = d,
             family = 'binomial')

  d$match_probability <- predict(fit, d,
                                 type = 'response')

  # get cutoff in estimated match probability that maximizes expected F1 score (denoted pi in the manuscript)
  get_cutoff <- function(df, fit){
    df <- df[order(df$match_probability),]
    df$expected_false_negatives <- cumsum(df$match_probability)
    df$identified_false_negatives <- cumsum(ifelse(is.na(df$match), 0, as.numeric(df$match == 'Yes')))
    df <- df[order(-df$match_probability),]
    df$expected_false_positives <- cumsum(1-df$match_probability)
    df$identified_false_positives <- cumsum(1 - ifelse(is.na(df$match), 1, as.numeric(df$match == 'Yes')))
    df$expected_true_positives <- cumsum(df$match_probability)
    df$identified_true_positives <- cumsum(ifelse(is.na(df$match), 0, as.numeric(df$match == 'Yes')))

    total_labeled_true <- sum(df$match == 'Yes', na.rm = TRUE)

    df$tp <- total_labeled_true + (df$expected_true_positives - df$identified_true_positives)
    df$fp <- df$expected_false_positives - df$identified_false_positives
    df$fn <- df$expected_false_negatives - df$identified_false_negatives

    df$expected_recall <- df$tp / (df$tp + df$fn)
    df$expected_precision <- df$tp / (df$tp + df$fp)
    df$expected_f1 = 2 * (df$expected_recall * df$expected_precision) /
      (df$expected_recall + df$expected_precision)

    return(df$match_probability[which.max(df$expected_f1)])
  }

  cutoff <- get_cutoff(d, fit)

  d <- d |>
    mutate(guess = if_else(match_probability > cutoff,
                           if_else(is.na(match), 'Yes', match),
                           if_else(is.na(match), 'No', match)))

  true_positives <- sum(d$guess == 'Yes' & d$truth == 1)
  false_negatives <- sum(d$guess == 'No' & d$truth == 1)
  false_positives <- sum(d$guess == 'Yes' & d$truth == 0)

  # return Number of Labels, Precision, and Recall
  return(c(
    prettyNum( sum(!is.na(d$match)), big.mark = ','), # Num. Labeled
    paste0( round(true_positives / (true_positives + false_positives), 3)*100, '%' ), #Precision
    paste0( round(true_positives / (true_positives + false_negatives), 3)*100, '%' ) # Recall
  ))
}

set.seed(1337)

tb_a2 <- tribble(~Method, ~'Number of Labels', ~Precision, ~Recall,
                 'fuzzylink', '3,259', '95.8%', '95.8%',
                 'Passive Labeling (1%)', !!!passive_learner(df, p = 0.01),
                 'Passive Labeling (2.5%)', !!!passive_learner(df, p = 0.025),
                 'Passive Labeling (5%)', !!!passive_learner(df, p = 0.05),
                 'Passive Labeling (10%)', !!!passive_learner(df, p = 0.1),
                 'Passive Labeling (25%)', !!!passive_learner(df, p = 0.25),
                 'Passive Labeling (50%)', !!!passive_learner(df, p = 0.5),
                 'Passive Labeling (75%)', !!!passive_learner(df, p = 0.75))

tb_a2 |>
  xtable::xtable(type = 'latex',
                 caption = 'Estimated performance for variants of the model with \"passive\" rather than active labeling',
                 label = 'tab:passive') |>
  print(file = 'tables/table_a2.tex')


## Mappings between embedding similarity and match probability (Figure A3) -----------------

# Candidate Names
load('data/candidate-merge/gpt-4o-2024-11-20/match ~ sim + jw/l2_fuzzylink_all_pairs.RData')

fit <- glm(as.numeric(match == 'Yes') ~ sim,
           data = df,
           family = 'binomial')

plot_data <- tibble(sim = seq(0,1,0.001),
                    application = 'Candidate Names')
plot_data$prob <- predict(fit, plot_data, type = 'response')

# City Names
load('data/cities-merge/cities_fuzzylink_all-pairs.RData')

fit <- glm(as.numeric(match == 'Yes') ~ sim,
           data = df,
           family = 'binomial')

add_to_plot_data <- tibble(sim = seq(0,1,0.001),
                           application = 'City Names')
add_to_plot_data$prob <- predict(fit, add_to_plot_data,
                                 type = 'response')

plot_data <- bind_rows(plot_data, add_to_plot_data)


# Party Names (Austria, Israel, and Spain)
cns <- c('Austria', 'Israel', 'Spain')

for(cn in cns){
  load(paste0('data/parlgov-calibration/gpt-4o-2024-11-20/match ~ sim + jw/', cn, '.RData'))

  # merge with truth
  df <- df |>
    left_join(elections |>
                select(A=party_name,
                       B=party_name_english,
                       election_date) |>
                mutate(truth = 1)) |>
    mutate(truth = if_else(is.na(truth), 0, truth))

  fit <- glm(truth ~ sim,
             data = df,
             family = 'binomial')

  add_to_plot_data <- tibble(sim = seq(0,1,0.001),
                             application = paste0('Party Names (', cn, ')'))
  add_to_plot_data$prob <- predict(fit, add_to_plot_data,
                                   type = 'response')

  plot_data <- bind_rows(plot_data, add_to_plot_data)
}

fig_a3 <- ggplot(data = plot_data,
       mapping = aes(x = sim,
                     y = prob,
                     color = application)) +
  geom_line() +
  labs(x = 'Embedding Similarity',
       y = 'Match Probability',
       color = 'Application') +
  scale_color_brewer(palette = 'Set1') +
  theme_bw()

ggsave('figures/fig_a3.png',
       plot = fig_a3,
       width = 8, height = 5)


## Open-source Mistral variants (Figures A5 & A6) ---------------------

model <- 'open-mixtral-8x22b'
fmla <- match ~ sim + jw
d <- list()
i <- 1
for(f in list.files(paste0('data/parties-merge/', model, '/', deparse(fmla), '/'),
                    pattern = '\\.RData$',
                    full.names = TRUE)){
  load(f)
  d[[i]] <- df
  i <- i + 1
}
df <- bind_rows(d)

actual <- elections |>
  # compute seat-share weighted LR of parliament
  group_by(country_name, election_date) |>
  summarize(lr_actual = weighted.mean(left_right, seats, na.rm = TRUE))

estimate <- df |>
  # for each party, estimate their lr score based on match probability
  group_by(A, country_name, election_date) |>
  summarize(seats = unique(seats),
            lr_hat = weighted.mean(left_right, match_probability, na.rm = TRUE)) |>
  # estimate seat-share weighted LR of parliament
  group_by(country_name, election_date) |>
  filter(!is.na(seats)) |>
  summarize(lr_estimated = weighted.mean(lr_hat, seats, na.rm = TRUE)) |>
  ungroup()

parliaments <- left_join(actual, estimate)

fig_a5 <- parliaments |>
  mutate(abs_error = abs(lr_estimated - lr_actual)) |>
  ggplot(mapping = aes(x=lr_actual, y=lr_estimated, color = abs_error > 1)) +
  geom_point(alpha = 0.7) +
  geom_abline(intercept = 0, slope = 1, linetype = 'dashed') +
  scale_x_continuous(limits = c(3.5,7.5)) +
  scale_y_continuous(limits = c(3.5,7.5)) +
  coord_equal() +
  theme_bw() +
  labs(x = 'Seat-Weighted Ideology (Actual)',
       y = 'Seat-Weighted Ideology (Estimated)') +
  scale_color_manual(values = c('black', 'red')) +
  theme(legend.position = 'none')

ggsave(filename = 'figures/fig_a5.png',
       plot = fig_a5,
       width = 8, height = 5)

fig_a6 <- parliaments |>
  ggplot() +
  geom_point(mapping = aes(x=election_date, y=lr_estimated)) +
  # geom_line(mapping = aes(x=election_date, y=lr_estimated)) +
  geom_line(mapping = aes(x=election_date, y=lr_actual)) +
  facet_wrap(~country_name, scales = 'free', ncol = 5) +
  theme_bw() +
  labs(x = 'Election Date', y = 'Seat Share-Weighted Ideology of Parliament')

ggsave(filename = 'figures/fig_a6.png',
       plot = fig_a6,
       width = 8, height = 8)



## ROC curves for fuzzy string metrics (Figure A7) -------------

# evaluation_set is 4,000 organization name pairs from Kaufman & Klevs (2022)
# with crowd-coded labels and similarity metrics
library(pROC)
load('data/organizations-merge/evaluation_set_with_embeddings.RData')

pdf("figures/fig_a7.pdf")
plot(roc(d, response=label, predictor=jaccard),
     print.auc = TRUE,
     col = "black",  xlim=c(1,0),
     xlab = 'False Positive Rate',
     ylab = 'True Positive Rate',
     legacy.axes = TRUE
)
plot(roc(d, response=label, predictor=hitl_score), print.auc = TRUE,
     col = "red", print.auc.y = .4, add = TRUE)
plot(roc(d, response=label, predictor=sim), print.auc = TRUE,
     col = "blue", print.auc.y = .3, add = TRUE)

text(x = rep(0.3, 3), y=c(.485,.385,.285),
     labels = c("Jaccard", "AFSM", "Embeddings"), pos=4,
     col=c("black", "red", "blue"))
dev.off()
